####
# Author: M. Kenwick, S. Lee, B. Kolcak
# Purpose: Fightin' Words analysis
# Date: May 19, 2025 
####
# Start from an empty global environment (removes every object visible to ls()).
rm(list=ls())
# Packages attached for the rest of the script.
library(lme4)
library(dplyr)
library(ggplot2)
library(tidyr)
library(ggrepel)
library(quanteda)
library(stringr)
# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path used below ('master.csv', 'speech_data/...', 'plots/...') resolves
# against it, so adjust this line when replicating on another machine.
setwd('~/Dropbox/cmr_cong/replication/')

################################################################################
# 0 - Load functions from Monroe
# From https://burtmonroe.github.io/TextAsDataCourse/Tutorials/TADA-FightinWords.nb.html
################################################################################

# Compute "Fightin' Words" statistics (regularised log-odds comparisons of
# term usage) across groups of documents.
#
# Arguments:
#   dtm     - document-term matrix (documents in rows, terms in columns); any
#             object supporting rowSums/colSums/`[` and coercible by apply().
#   groups  - group label for each row of `dtm`. A factor, or anything
#             as.factor() accepts (e.g. a character vector).
#   pair    - optional index (presumably two row names or positions of the
#             group-level matrices) selecting two groups for a pairwise
#             comparison; NULL (default) compares every group with all others.
#   weights - per-document weights; NA weights are treated as 0 and the
#             weights are rescaled to have mean 1.
#   k.prior - multiplier applied to the prior pseudo-counts.
#
# Returns a list with elements zeta (delta / se), delta, se, counts, acounts.
#   * pair = NULL: each element is a groups x terms matrix (`counts` holds the
#     weighted raw counts, `acounts` the prior-augmented counts).
#   * pair given: zeta/delta/se are term-length vectors for the FIRST group of
#     the pair; `counts` is the column sum of the weighted counts over all
#     retained documents and `acounts` the column sum of the augmented counts
#     over all groups (not only the pair).
fwgroups <- function(dtm, groups, pair = NULL, weights = rep(1,nrow(dtm)), k.prior = .1) {

  # droplevels() below has no method for character vectors, so coerce first;
  # this is a no-op when `groups` is already a factor.
  groups <- as.factor(groups)

  # Guard against silent recycling / misalignment between documents and labels.
  stopifnot(length(groups) == nrow(dtm), length(weights) == nrow(dtm))

  # Missing weights count as zero; rescale so the weights average to 1.
  weights[is.na(weights)] <- 0
  weights <- weights/mean(weights)

  # Drop documents that are empty or have zero weight, then drop terms that
  # never occur in the remaining documents.
  zero.doc <- rowSums(dtm)==0 | weights==0
  zero.term <- colSums(dtm[!zero.doc,])==0

  # Multiply every column by the document weights (i.e. scale each row by its
  # document's weight). apply() also densifies the matrix.
  dtm.nz <- apply(dtm[!zero.doc,!zero.term],2,"*", weights[!zero.doc])

  # Prior pseudo-counts: (row total x column total) / grand total for each cell.
  g.prior <- tcrossprod(rowSums(dtm.nz),colSums(dtm.nz))/sum(dtm.nz)

  # Prior-augmented counts per document.
  g.posterior <- as.matrix(dtm.nz + k.prior*g.prior)

  groups <- groups[!zero.doc]
  groups <- droplevels(groups)

  # Sum augmented counts within each group (rows come back in level order).
  g.adtm <- as.matrix(aggregate(x=g.posterior,by=list(groups=groups),FUN=sum)[,-1])
  rownames(g.adtm) <- levels(groups)

  g.ladtm <- log(g.adtm)

  # Double-centre the log counts: remove column (term) means, then row
  # (group) means.
  g.delta <- t(scale( t(scale(g.ladtm, center=TRUE, scale=FALSE)), center=TRUE, scale=FALSE))

  g.adtm_w <- -sweep(g.adtm,1,rowSums(g.adtm)) # terms not w spoken by k
  g.adtm_k <- -sweep(g.adtm,2,colSums(g.adtm)) # w spoken by groups other than k
  g.adtm_kw <- sum(g.adtm) - g.adtm_w - g.adtm_k - g.adtm # total terms not w or k

  # Standard error from the four cells of each (group, term) 2x2 table.
  g.se <- sqrt(1/g.adtm + 1/g.adtm_w + 1/g.adtm_k + 1/g.adtm_kw)

  g.zeta <- g.delta/g.se

  # Weighted raw counts per group (no prior added).
  g.counts <- as.matrix(aggregate(x=dtm.nz, by = list(groups=groups), FUN=sum)[,-1])

  if (!is.null(pair)) {
    # Repeat the same computation restricted to the two selected groups.
    pr.delta <- t(scale( t(scale(g.ladtm[pair,], center = TRUE, scale = FALSE)), center=TRUE, scale=FALSE))
    pr.adtm_w <- -sweep(g.adtm[pair,],1,rowSums(g.adtm[pair,])) # terms not w spoken by k
    pr.adtm_k <- -sweep(g.adtm[pair,],2,colSums(g.adtm[pair,])) # w spoken by the other group of the pair
    pr.adtm_kw <- sum(g.adtm[pair,]) - pr.adtm_w - pr.adtm_k - g.adtm[pair,] # total terms not w or k
    pr.se <- sqrt(1/g.adtm[pair,] + 1/pr.adtm_w + 1/pr.adtm_k + 1/pr.adtm_kw)
    pr.zeta <- pr.delta/pr.se

    # Only the first row is returned: statistics for the first group of the pair.
    return(list(zeta=pr.zeta[1,], delta=pr.delta[1,],se=pr.se[1,], counts = colSums(dtm.nz), acounts = colSums(g.adtm)))
  } else {
    return(list(zeta=g.zeta,delta=g.delta,se=g.se,counts=g.counts,acounts=g.adtm))
  }
}
# Plotting function
# Convert one or more colours to hex strings that carry an alpha channel.
#   someColor - any colour specification accepted by col2rgb()
#   alpha     - alpha value on a 0-255 scale, applied to every colour
# Returns the "#RRGGBBAA" string for each input colour.
makeTransparent <- function(someColor, alpha = 100) {
  # col2rgb() yields a matrix with one column per colour and rows
  # red / green / blue.
  rgb_channels <- col2rgb(someColor)

  # Build the hex string for a single colour column.
  to_hex <- function(channel) {
    rgb(red = channel[1], green = channel[2], blue = channel[3],
        alpha = alpha, maxColorValue = 255)
  }

  apply(rgb_channels, 2, to_hex)
}
# Plot the top Fightin' Words terms for each group, one facet per group.
#
# Arguments:
#   fw.ch         - result list from fwgroups() (uses $zeta and $counts).
#   groups.use    - groups to plot, in facet order. Defaults to every row name
#                   of fw.ch$zeta. For a two-group (pair) result, where zeta is
#                   a plain vector without row names, supply the two labels
#                   explicitly (first label = group with high zeta).
#   max.words     - keep terms whose within-group zeta rank is <= max.words.
#   max.countrank - keep terms whose overall frequency rank is <= max.countrank.
#   colorpalette  - one colour per group, in the order of groups.use.
#   sizescale     - scaling factor for label and point sizes.
#   title, subtitle, caption - plot annotations.
#
# Returns the ggplot object invisibly; print() it (or evaluate the assigned
# object at top level) to draw it.
fw.ggplot.groups <- function(fw.ch, groups.use = as.factor(rownames(fw.ch$zeta)), max.words = 50, max.countrank = 400, colorpalette=rep("black",length(groups.use)), sizescale=2, title="Comparison of Terms by Groups", subtitle = "", caption = "Group-specific terms are ordered by Fightin' Words statistic (Monroe, et al. 2008)") {
  # Indexing a matrix with a factor uses the factor's integer codes, not its
  # labels, which silently picks the wrong rows whenever the requested groups
  # are not in sorted level order. Select rows by label instead.
  group.sel <- if (is.factor(groups.use)) as.character(groups.use) else groups.use

  if (is.null(dim(fw.ch$zeta))) {## two-group fw object consists of vectors, not matrices
    # Column 1 ranks terms from highest zeta down, column 2 from lowest up.
    zetarankmat <- cbind(rank(-fw.ch$zeta),rank(fw.ch$zeta))
    colnames(zetarankmat) <- groups.use
    countrank <- rank(-(fw.ch$counts))
  } else {
    # Terms x groups matrix of within-group ranks (rank 1 = largest zeta).
    # drop = FALSE keeps a matrix even when a single group is requested.
    zetarankmat <- apply(-fw.ch$zeta[group.sel, , drop = FALSE],1,rank)
    countrank <- rank(-colSums(fw.ch$counts))
  }

  wideplotmat <- as_tibble(cbind(zetarankmat,countrank=countrank))
  wideplotmat$term=names(countrank)

  # Long format: one row per (term, group) with that group's zeta rank.
  rankplot <- gather(wideplotmat, groups.use, zetarank, 1:ncol(zetarankmat))
  # Higher-ranked terms get larger labels.
  rankplot$plotsize <- sizescale*(50/(rankplot$zetarank))^(1/4)
  rankplot <- rankplot[rankplot$zetarank < max.words + 1 & rankplot$countrank<max.countrank+1,]
  rankplot$groups.use <- factor(rankplot$groups.use,levels=groups.use)

  # x: overall frequency (more frequent terms to the right);
  # y: negated zeta rank (most group-distinctive terms at the top).
  p <- ggplot(rankplot, aes((nrow(rankplot)-countrank)^1, -(zetarank^1), colour=groups.use)) +
    geom_point(show.legend=F,size=sizescale/2) +
    theme_classic() +
    theme(axis.ticks=element_blank(), axis.text=element_blank() ) +
    ylim(-max.words,40) +
    facet_grid(groups.use ~ .) +
    geom_text_repel(aes(label = term), size = rankplot$plotsize, point.padding=.05,
                    box.padding = unit(0.20, "lines"), show.legend=F) +
    scale_colour_manual(values = alpha(colorpalette, .7)) +
    labs(x="Terms used more frequently overall -->", y="Terms used more frequently by group -->",  title=title, subtitle=subtitle , caption = caption)

  # Preserve the original calling convention: the plot is returned without
  # being auto-printed.
  invisible(p)
}
# Extract the top-scoring terms for each group.
#
# Arguments:
#   fw.ch  - result list from fwgroups() in multi-group form, i.e. fw.ch$zeta
#            is a groups x terms matrix with term names as column names.
#   n.keys - number of terms to return per group.
#
# Returns an n.keys x n.groups character matrix; column g lists the terms with
# the largest zeta for group g, in decreasing order. Columns are named after
# the row names of fw.ch$zeta.
fw.keys <- function(fw.ch,n.keys=10) {
  n.groups <- nrow(fw.ch$zeta)
  keys <- matrix("",n.keys,n.groups)
  colnames(keys) <- rownames(fw.ch$zeta)

  # seq_len() (rather than 1:n) keeps both the loop and the slice empty when
  # there are zero groups or zero keys requested.
  for (g in seq_len(n.groups)) {
    ranked <- sort(fw.ch$zeta[g,], decreasing = TRUE)
    keys[,g] <- names(ranked[seq_len(n.keys)])
  }
  keys
}
####

################################################################################
# 1 - Load data
################################################################################
# Speaker/speech metadata
meta <- read.csv('master.csv')
# Binary indicator: 1 when military_collapse is positive, 0 otherwise
# (NA where military_collapse is missing).
meta$mil <- ifelse(meta$military_collapse > 0, 1, 0)

# Keep only rows with at least four keyword hits.
meta <- meta %>%
  dplyr::filter(keyword_count >= 4)

# Load speeches, subsetting to Irq/Afg speeches (those listed in `meta`).
# Each congress file is filtered right after it is read so that only the
# matching rows, not all seven full files, are held in memory at once.
speech_ids <- unique(meta$speech_id)
speeches <- do.call(rbind, lapply(107:113, function(congress) {
  sp <- read.delim(sprintf('speech_data/speeches_%d.txt', congress),
                   header = TRUE, sep = "|", quote = "",
                   stringsAsFactors = FALSE, encoding = "UTF-8")
  sp[sp$speech_id %in% speech_ids, ]
}))

meta$party[meta$party == "I"] <- "D" # Lumping independents w/ Democrats

# Attach speaker attributes to every speech (left join on speech_id).
# NOTE(review): if `meta` holds more than one row per speech_id, merge() will
# duplicate that speech -- confirm master.csv is unique on speech_id.
data <- merge(speeches, dplyr::select(meta, speech_id, party, speakerid, mil),
              by = "speech_id", all.x = TRUE, all.y = FALSE)
data <- tibble(data)

# Replace punctuation characters with spaces
data$speech <- str_replace_all(data$speech, "[[:punct:]]", " ")
# Speech length in characters (after punctuation replacement)
data$length <- nchar(data$speech)

#### Prepare Data for FW 
# One document per speech, keyed by speech_id.
corp <- corpus(data, text_field = "speech", docid_field = "speech_id")
# Tokenise; drop numbers, punctuation, symbols and snowball stopwords; stem;
# keep unigrams.
tok <- tokens(corp, remove_numbers = TRUE, remove_punct = TRUE,
              remove_symbols = TRUE) %>%
  tokens_remove(stopwords(source = "snowball")) %>%
  tokens_wordstem() %>%
  tokens_ngrams(1)
dfm.full <- dfm(tok, verbose = TRUE, tolower = TRUE)
# Keep terms that appear at least 50 times overall and in at least 30 documents.
dfmtrimmed <- dfm_trim(dfm.full, min_docfreq = 30, min_termfreq = 50, verbose = TRUE)

# Groups: party x veteran status. Rows matching none of the four conditions
# (e.g. missing party or mil) stay "Other".
data$groups <- "Other"
data$groups[data$mil == 0 & data$party == "D"] <- "Dem., Non-veteran"
data$groups[data$mil == 1 & data$party == "D"] <- "Dem., Veteran"
data$groups[data$mil == 0 & data$party == "R"] <- "Rep., Non-Veteran"
data$groups[data$mil == 1 & data$party == "R"] <- "Rep., Veteran"

# Generate Figure 4
fw_test2 <- fwgroups(dfmtrimmed, groups = as.factor(data$groups))
fwkeys_test2 <- fw.keys(fw_test2, n.keys = 15)
# Colours are matched to groups in the row order of fw_test2$zeta.
plot_test2 <- fw.ggplot.groups(fw_test2, sizescale = 4, max.words = 200, max.countrank = 400, colorpalette = c("steelblue", "darkolivegreen4", "orangered", "darkolivegreen4"))

pdf("plots/fig4.pdf", height = 9, width = 12)
# Print explicitly: a bare object name is only auto-printed at top level, so
# running this file through source() would otherwise write an empty PDF.
print(plot_test2)
dev.off()

